Ejemplo de libro
#ejemplo de libro
sff<- load454SampleData()
## Total number of reads to be read: 1000
## reading header for sff file:/home/sergio/R/x86_64-pc-linux-gnu-library/3.3/rSFFreader/extdata/Small454Test.sff
## reading file:/home/sergio/R/x86_64-pc-linux-gnu-library/3.3/rSFFreader/extdata/Small454Test.sff
##Generate some QA plots:
##Read length histograms:
par(mfrow=c(2,2))
clipMode(sff) <- "raw"
hist(width(sff),breaks=500,col="grey",xlab="Read Length",main="Raw Read Length")
## Base by position plots:
clipMode(sff) <- "raw"
ac <- alphabetByCycle(sread(sff),alphabet=c("A","C","T","G","N"))
ac.reads <- apply(ac,2,sum)
acf <- sweep(ac,MARGIN=2,FUN="/",STATS=apply(ac,2,sum))
matplot(cbind(t(acf),ac.reads/ac.reads[1]),col=c("green","blue","black","red","darkgrey","purple"),
type="l",lty=1,xlab="Base Position",ylab="Base Frequency",
main="Base by position")
cols <- c("green","blue","black","red","darkgrey","purple")
leg <- c("A","C","T","G","N","%reads")
legend("topright", col=cols, legend=leg, pch=18, cex=.8)
clipMode(sff) <- "full"
hist(width(sff),breaks=500,col="grey",xlab="Read Length",main="Clipped Read Length")
ac <- alphabetByCycle(sread(sff),alphabet=c("A","C","T","G","N"))
ac.reads <- apply(ac,2,sum)
acf <- sweep(ac,MARGIN=2,FUN="/",STATS=apply(ac,2,sum))
matplot(cbind(t(acf),ac.reads/ac.reads[1]),col=c("green","blue","black","red","darkgrey","purple"),
type="l",lty=1,xlab="Base Position",ylab="Base Frequency",
main="Base by position")
legend("topright", col=cols, legend=leg, pch=18, cex=.8)

SIN CLIP - RAW DATA ANALYSIS
samples<-list.files("./samples")
lenStat_raw<-vector()
lenStat_qual<-vector()
nucFreq_raw<-matrix(nrow=1000, ncol=1)
nucFreq_qual<-matrix(nrow=1000, ncol=1)
count_raw<-data.frame()
count_qual<-data.frame()
len_raw<-list()
len_qual<-list()
for(i in samples){
sff<-readSff(paste("samples/",i,sep=""), use.qualities=TRUE, use.names=TRUE,clipMode = c("raw"), verbose=TRUE)
##Generate some QA plots:
##Read length histograms (with and without clipping):
# RAW
par(mfrow=c(2,2))
clipMode(sff) <- "raw"
hist(width(sff),breaks=500,col="grey",xlab="Read Length",
xlim= c(50,800), main= "RAW read length")
lenStat_raw<-rbind(lenStat_raw, c(gsub("454Reads.MID_","",gsub(".sff","",i)), summary(width(sff)) ) )
count_raw[i,1]<-length(sff)
len_raw[[i]]<-width(sff)
## Base by position plots:
ac <- alphabetByCycle(sread(sff),alphabet=c("A","C","T","G","N"))
ac.reads <- apply(ac,2,sum)
acf <- sweep(ac,MARGIN=2,FUN="/",STATS=apply(ac,2,sum))
tacf<-t(acf); colnames(tacf)<-paste(gsub("454Reads.MID_","",gsub(".sff","",i)),"_", c("A","C","T","G","N"),sep="")
nucFreq_raw<-cbindX(nucFreq_raw,tacf)
matplot(cbind(t(acf),ac.reads/ac.reads[1]),col=c("green","blue","black","red","darkgrey","purple"),
type="l",lty=1,xlab="Base Position",ylab="Base Frequency",
main="Base by position", xlim=c(0,1500))
cols <- c("green","blue","black","red","darkgrey","purple")
leg <- c("A","C","T","G","N","%reads")
legend("topright", col=cols, legend=leg, pch=18, cex=.8)
### FILTER QUALITY
clipMode(sff) <- "full"
hist(width(sff),breaks=500,col="grey",xlab="Read Length",
xlim= c(50,800),main="CLIPPED read length" )
lenStat_qual<-rbind(lenStat_qual,c(gsub("454Reads.MID_","",gsub(".sff","",i)), summary(width(sff)) ) )
count_qual[i,1]<-length(sff)
len_qual[[i]]<-width(sff)
## Base by position plots:
ac <- alphabetByCycle(sread(sff),alphabet=c("A","C","T","G","N"))
ac.reads <- apply(ac,2,sum)
acf <- sweep(ac,MARGIN=2,FUN="/",STATS=apply(ac,2,sum))
tacf<-t(acf); colnames(tacf)<-paste(gsub("454Reads.MID_","",gsub(".sff","",i)),"_", c("A","C","T","G","N"),sep="")
nucFreq_qual<-cbindX(nucFreq_qual,tacf)
matplot(cbind(t(acf),ac.reads/ac.reads[1]),col=c("green","blue","black","red","darkgrey","purple"),
type="l",lty=1,xlab="Base Position",ylab="Base Frequency",
main="Base by position", xlim=c(0,1000))
par(mfrow=c(1,1))
legend("topright", col=cols, legend=leg, pch=18, cex=.8)
title(paste("sample: ",gsub("454Reads.MID_","",gsub(".sff","",i)),sep=""))
dev.copy(png,filename=paste("Explo_",gsub("454Reads.MID_","",gsub(".sff","",i)),".png",sep=""));
dev.off ();
}
## Total number of reads to be read: 2233
## reading header for sff file:samples/454Reads.MID_s01_rs3.sff
## reading file:samples/454Reads.MID_s01_rs3.sff

## Total number of reads to be read: 2405
## reading header for sff file:samples/454Reads.MID_s02_rs3.sff
## reading file:samples/454Reads.MID_s02_rs3.sff

## Total number of reads to be read: 2054
## reading header for sff file:samples/454Reads.MID_s03_rs5.sff
## reading file:samples/454Reads.MID_s03_rs5.sff

## Total number of reads to be read: 11821
## reading header for sff file:samples/454Reads.MID_s04_rl3.sff
## reading file:samples/454Reads.MID_s04_rl3.sff

## Total number of reads to be read: 10580
## reading header for sff file:samples/454Reads.MID_s05_rl3.sff
## reading file:samples/454Reads.MID_s05_rl3.sff

## Total number of reads to be read: 8869
## reading header for sff file:samples/454Reads.MID_s06_rl5.sff
## reading file:samples/454Reads.MID_s06_rl5.sff

## Total number of reads to be read: 9255
## reading header for sff file:samples/454Reads.MID_s07_dl3.sff
## reading file:samples/454Reads.MID_s07_dl3.sff

## Total number of reads to be read: 9579
## reading header for sff file:samples/454Reads.MID_s08_dl3.sff
## reading file:samples/454Reads.MID_s08_dl3.sff

## Total number of reads to be read: 9987
## reading header for sff file:samples/454Reads.MID_s09_dl5.sff
## reading file:samples/454Reads.MID_s09_dl5.sff

EstadÃsticas de las lecturas
#Statistics about lenght
as.data.frame(lenStat_raw)
## V1 Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1 s01_rs3 198 577 603 623.1 648 1093
## 2 s02_rs3 193 572 600 622.5 652 1199
## 3 s03_rs5 200 603 644 654.4 691 1121
## 4 s04_rl3 533 686 694 695.8 702 929
## 5 s05_rl3 499 687 695 697.3 704 1133
## 6 s06_rl5 488 696 703 703 711 1779
## 7 s07_dl3 526 687 695 697.7 704 1477
## 8 s08_dl3 509 683 690 692.2 698 1211
## 9 s09_dl5 487 695 702 701.1 710 851
write.csv(as.data.frame(lenStat_raw), "Raw_Readlength_stats.csv",row.names = FALSE )
as.data.frame(lenStat_qual)
## V1 Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1 s01_rs3 30 247 310 301.4 364 540
## 2 s02_rs3 30 244 300 294.3 348 524
## 3 s03_rs5 30 219 261 256.7 329 512
## 4 s04_rl3 379 613 626 598.7 630 668
## 5 s05_rl3 390 615 627 600.3 631 1114
## 6 s06_rl5 382 560 635 596.2 636 710
## 7 s07_dl3 382 618 628 602.3 631 664
## 8 s08_dl3 381 622 627 603.1 630 683
## 9 s09_dl5 382 568 635 601.1 636 654
write.csv(as.data.frame(lenStat_qual), "Clip_Readlength_stats.csv",row.names = FALSE )
#Nucleotide frequency by position
write.csv(nucFreq_raw, "Raw_NuclFreqByPos.csv",row.names = FALSE )
write.csv(nucFreq_qual, "Clip_NuclFreqByPos.csv",row.names = FALSE )
#All length data by sample
count_raw$sample<-gsub("454Reads.MID_","",gsub(".sff","",rownames(count_raw)))
count_qual$sample<-gsub("454Reads.MID_","",gsub(".sff","",rownames(count_qual)))
length_raw<-do.call(cbindX, lapply(len_raw, as.data.frame))
colnames(length_raw)<-gsub("454Reads.MID_","",gsub(".sff","",rownames(count_raw)))
length_qual<-do.call(cbindX, lapply(len_qual, as.data.frame))
colnames(length_qual)<-gsub("454Reads.MID_","",gsub(".sff","",rownames(count_qual)))
rownames(count_raw)<-NULL;rownames(count_qual)<-NULL
write.csv(length_raw, "Raw_AllReadlength.csv",row.names = FALSE )
write.csv(length_qual, "Clip_AllReadlength.csv",row.names = FALSE )
#AMount of sequences by sample
count_raw
## V1 sample
## 1 2233 s01_rs3
## 2 2405 s02_rs3
## 3 2054 s03_rs5
## 4 11821 s04_rl3
## 5 10580 s05_rl3
## 6 8869 s06_rl5
## 7 9255 s07_dl3
## 8 9579 s08_dl3
## 9 9987 s09_dl5
write.csv(count_raw, "Raw_CountBySample.csv",row.names = FALSE )
count_qual
## V1 sample
## 1 2233 s01_rs3
## 2 2405 s02_rs3
## 3 2054 s03_rs5
## 4 11821 s04_rl3
## 5 10580 s05_rl3
## 6 8869 s06_rl5
## 7 9255 s07_dl3
## 8 9579 s08_dl3
## 9 9987 s09_dl5
write.csv(count_qual, "Clip_CountBySample.csv",row.names = FALSE )
#GRAFICO DE BARRAS CON LA CANTIDAD DE LECTURAS
#plot(cantidad$cantidad, type="b")
library(ggplot2)
p<-ggplot(count_raw, aes(x=sample, weight=V1))+ geom_bar(fill="#2b8cbe")+
theme(axis.text.x = element_text(angle = 45, hjust = 1))
ggplotly(p)
#GRAFICO DE CAJAS
#boxplot(longitud)
p<-ggplot(data = melt(length_raw), aes(x=variable, y=value)) + geom_boxplot(aes(fill=variable))+
theme(axis.text.x = element_text(angle = 45, hjust = 1))+ ggtitle("Raw Reads")
ggplotly(p)
#GRAFICO DE CAJAS
#boxplot(longitud)
p<-ggplot(data = melt(length_qual), aes(x=variable, y=value)) + geom_boxplot(aes(fill=variable))+
theme(axis.text.x = element_text(angle = 45, hjust = 1))+ ggtitle("Clipped Reads")
ggplotly(p)